{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8e08d5a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sqlalchemy import create_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1b497be5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The SQLite file sits in the parent of the current working directory.\n",
    "parent_dir = os.path.dirname(os.getcwd())\n",
    "db_path = os.path.join(parent_dir, \"data.db\")\n",
    "# SQLAlchemy connection URL for that file.\n",
    "engine_path = f\"sqlite:///{db_path}\"\n",
    "# Engine used by the query cell below.\n",
    "engine = create_engine(engine_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c8ac4f3",
   "metadata": {},
   "source": [
    "### 获取数据"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2640f24",
   "metadata": {},
   "source": [
    ">user_id：用户id\n",
    ">\n",
    ">username：用户名\n",
    ">\n",
    ">age：年龄\n",
    ">\n",
    ">content：评论内容\n",
    ">\n",
    ">sentiment_value：情感值【0消极，1积极，-1未知】（用飞桨重新训练得到情感值）\n",
    ">\n",
    ">create_time：评论时间\n",
    ">\n",
    ">subject：评论主题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ea80e06d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Users joined to their comments: the left join keeps every row of `users`,\n",
    "# and the sentiment_value column stays commented out of the select list.\n",
    "sql = \"\"\"\n",
    "select \n",
    "a.user_id\n",
    ",a.username\n",
    ",a.age\n",
    ",b.content\n",
    "--,b.sentiment_value\n",
    ",b.create_time\n",
    ",b.subject\n",
    "from\n",
    "users as a\n",
    "left join\n",
    "comment as b\n",
    "on a.user_id=b.user_id\n",
    "\"\"\"\n",
    "\n",
    "# Run the query through the SQLAlchemy engine and load the result as a DataFrame.\n",
    "df = pd.read_sql(sql=sql, con=engine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c8a7905e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>username</th>\n",
       "      <th>age</th>\n",
       "      <th>content</th>\n",
       "      <th>create_time</th>\n",
       "      <th>subject</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5926</th>\n",
       "      <td>5927</td>\n",
       "      <td>nBG8IF</td>\n",
       "      <td>28</td>\n",
       "      <td>又退烧啦</td>\n",
       "      <td>2021-06-03 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7728</th>\n",
       "      <td>7729</td>\n",
       "      <td>6N5qxO</td>\n",
       "      <td>39</td>\n",
       "      <td>看样子倒是像一张被锐化的图片，相机镜头本身并不太好。。。</td>\n",
       "      <td>2021-01-23 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1600</th>\n",
       "      <td>1601</td>\n",
       "      <td>BadocV</td>\n",
       "      <td>44</td>\n",
       "      <td>hd598是打游戏的监听耳机，声音粗</td>\n",
       "      <td>2021-11-28 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6682</th>\n",
       "      <td>6683</td>\n",
       "      <td>iSp995</td>\n",
       "      <td>45</td>\n",
       "      <td>放远点就没事了</td>\n",
       "      <td>2021-03-30 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4026</th>\n",
       "      <td>4027</td>\n",
       "      <td>ccaaYn</td>\n",
       "      <td>20</td>\n",
       "      <td>主要是散热问题，其他的还好，HiFi系统高频辐射几乎可以忽略</td>\n",
       "      <td>2021-11-17 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id username  age                         content  \\\n",
       "5926     5927   nBG8IF   28                            又退烧啦   \n",
       "7728     7729   6N5qxO   39    看样子倒是像一张被锐化的图片，相机镜头本身并不太好。。。   \n",
       "1600     1601   BadocV   44              hd598是打游戏的监听耳机，声音粗   \n",
       "6682     6683   iSp995   45                         放远点就没事了   \n",
       "4026     4027   ccaaYn   20  主要是散热问题，其他的还好，HiFi系统高频辐射几乎可以忽略   \n",
       "\n",
       "                     create_time subject  \n",
       "5926  2021-06-03 00:00:00.000000      其他  \n",
       "7728  2021-01-23 00:00:00.000000      其他  \n",
       "1600  2021-11-28 00:00:00.000000      其他  \n",
       "6682  2021-03-30 00:00:00.000000      其他  \n",
       "4026  2021-11-17 00:00:00.000000      其他  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five randomly chosen rows; no random_state is set, so they differ per run.\n",
    "df.sample(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9250cf3",
   "metadata": {},
   "source": [
    "### 情感倾向"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0bbfc0b3",
   "metadata": {},
   "source": [
    "用百度飞桨（PaddlePaddle）模型库中的情感分析模型，将评论数据（content）转化为情感类别【积极1，消极0】"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94df2f2c",
   "metadata": {},
   "source": [
    "#### senta_bilstm 模型"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7db63abd",
   "metadata": {},
   "source": [
    ">**一、Windows 10 + Anaconda3 的安装命令：**\n",
    ">\n",
    ">conda install paddlepaddle==2.2.1 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/Paddle/\n",
    ">\n",
    ">其他安装飞桨的命令：[官网地址](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/conda/windows-conda.html)\n",
    "> \n",
    ">**二、安装预训练模型应用工具 PaddleHub**\n",
    ">\n",
    ">pip install paddlehub==2.0.0\n",
    ">\n",
    ">飞桨模型库地址：[官网地址](https://www.paddlepaddle.org.cn/hublist)\n",
    ">\n",
    ">飞桨情感分析模型介绍：[官网地址](https://www.paddlepaddle.org.cn/hubdetail?name=senta_bilstm&en_category=SentimentAnalysis)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e647e07e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[33m[2022-01-19 15:39:47,490] [ WARNING]\u001b[0m - The _initialize method in HubModule will soon be deprecated, you can use the __init__() to handle the initialization of the object\u001b[0m\n",
      "\u001b[33m[2022-01-19 15:39:50,229] [ WARNING]\u001b[0m - The _initialize method in HubModule will soon be deprecated, you can use the __init__() to handle the initialization of the object\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import paddlehub as hub\n",
    "\n",
    "# Load the senta_bilstm sentiment module through PaddleHub.\n",
    "senta = hub.Module(name=\"senta_bilstm\")\n",
    "\n",
    "# Every comment text, in DataFrame row order.\n",
    "test_text = df[\"content\"].tolist()\n",
    "\n",
    "# Classify each comment on CPU, one text per batch.\n",
    "results = senta.sentiment_classify(\n",
    "    texts=test_text,\n",
    "    use_gpu=False,\n",
    "    batch_size=1,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "975d01b6",
   "metadata": {},
   "source": [
    "#### 情感划分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f9846945",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将返回的结果转为 dataframe 数据，并拼接到原始数据中\n",
    "results_df = pd.DataFrame(results)\n",
    "\n",
    "df2 = pd.concat([df,results_df],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3b26af27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将 negative_probs>=0.7 的定义为消极\n",
    "df2[\"new_sentiment_label\"] = df2[\"negative_probs\"].map(lambda x: 0 if x>=0.7 else 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ae14b268",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>username</th>\n",
       "      <th>age</th>\n",
       "      <th>content</th>\n",
       "      <th>create_time</th>\n",
       "      <th>subject</th>\n",
       "      <th>text</th>\n",
       "      <th>sentiment_label</th>\n",
       "      <th>sentiment_key</th>\n",
       "      <th>positive_probs</th>\n",
       "      <th>negative_probs</th>\n",
       "      <th>new_sentiment_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3925</th>\n",
       "      <td>3926</td>\n",
       "      <td>w05S40</td>\n",
       "      <td>31</td>\n",
       "      <td>楼主在GA5上放这只玩具熊，以GA5的工作温度，不怕着火啊？</td>\n",
       "      <td>2021-03-20 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "      <td>楼主在GA5上放这只玩具熊，以GA5的工作温度，不怕着火啊？</td>\n",
       "      <td>0</td>\n",
       "      <td>negative</td>\n",
       "      <td>0.4157</td>\n",
       "      <td>0.5843</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>375</th>\n",
       "      <td>376</td>\n",
       "      <td>vYaCHf</td>\n",
       "      <td>28</td>\n",
       "      <td>日本电压是100V，市面上转110v的那种千万不要用</td>\n",
       "      <td>2021-11-10 00:00:00.000000</td>\n",
       "      <td>配置</td>\n",
       "      <td>日本电压是100V，市面上转110v的那种千万不要用</td>\n",
       "      <td>0</td>\n",
       "      <td>negative</td>\n",
       "      <td>0.3847</td>\n",
       "      <td>0.6153</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id username  age                         content  \\\n",
       "3925     3926   w05S40   31  楼主在GA5上放这只玩具熊，以GA5的工作温度，不怕着火啊？   \n",
       "375       376   vYaCHf   28      日本电压是100V，市面上转110v的那种千万不要用   \n",
       "\n",
       "                     create_time subject                            text  \\\n",
       "3925  2021-03-20 00:00:00.000000      其他  楼主在GA5上放这只玩具熊，以GA5的工作温度，不怕着火啊？   \n",
       "375   2021-11-10 00:00:00.000000      配置      日本电压是100V，市面上转110v的那种千万不要用   \n",
       "\n",
       "      sentiment_label sentiment_key  positive_probs  negative_probs  \\\n",
       "3925                0      negative          0.4157          0.5843   \n",
       "375                 0      negative          0.3847          0.6153   \n",
       "\n",
       "      new_sentiment_label  \n",
       "3925                    1  \n",
       "375                     1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Two random rows where the 0.7-threshold label disagrees with the model's own label.\n",
    "label_differs = df2[\"sentiment_label\"].ne(df2[\"new_sentiment_label\"])\n",
    "df2.loc[label_differs].sample(n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bc79e6f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "cb331fc2",
   "metadata": {},
   "source": [
    "### 数据描述"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8dc92112",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>username</th>\n",
       "      <th>age</th>\n",
       "      <th>content</th>\n",
       "      <th>create_time</th>\n",
       "      <th>subject</th>\n",
       "      <th>text</th>\n",
       "      <th>sentiment_label</th>\n",
       "      <th>sentiment_key</th>\n",
       "      <th>positive_probs</th>\n",
       "      <th>negative_probs</th>\n",
       "      <th>new_sentiment_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6013</th>\n",
       "      <td>6014</td>\n",
       "      <td>L1Z1UD</td>\n",
       "      <td>21</td>\n",
       "      <td>前几天听了下WA7 一分钟不到 没听出来什么 要是有机会 还想多听听woo家的东西</td>\n",
       "      <td>2021-04-15 00:00:00.000000</td>\n",
       "      <td>其他</td>\n",
       "      <td>前几天听了下WA7 一分钟不到 没听出来什么 要是有机会 还想多听听woo家的东西</td>\n",
       "      <td>1</td>\n",
       "      <td>positive</td>\n",
       "      <td>0.6008</td>\n",
       "      <td>0.3992</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id username  age                                    content  \\\n",
       "6013     6014   L1Z1UD   21  前几天听了下WA7 一分钟不到 没听出来什么 要是有机会 还想多听听woo家的东西   \n",
       "\n",
       "                     create_time subject  \\\n",
       "6013  2021-04-15 00:00:00.000000      其他   \n",
       "\n",
       "                                           text  sentiment_label  \\\n",
       "6013  前几天听了下WA7 一分钟不到 没听出来什么 要是有机会 还想多听听woo家的东西                1   \n",
       "\n",
       "     sentiment_key  positive_probs  negative_probs  new_sentiment_label  \n",
       "6013      positive          0.6008          0.3992                    1  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One random row of the combined frame.\n",
    "df2.sample(n=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "55fe1435",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10000 entries, 0 to 9999\n",
      "Data columns (total 12 columns):\n",
      " #   Column               Non-Null Count  Dtype  \n",
      "---  ------               --------------  -----  \n",
      " 0   user_id              10000 non-null  int64  \n",
      " 1   username             10000 non-null  object \n",
      " 2   age                  10000 non-null  int64  \n",
      " 3   content              10000 non-null  object \n",
      " 4   create_time          10000 non-null  object \n",
      " 5   subject              10000 non-null  object \n",
      " 6   text                 10000 non-null  object \n",
      " 7   sentiment_label      10000 non-null  int64  \n",
      " 8   sentiment_key        10000 non-null  object \n",
      " 9   positive_probs       10000 non-null  float64\n",
      " 10  negative_probs       10000 non-null  float64\n",
      " 11  new_sentiment_label  10000 non-null  int64  \n",
      "dtypes: float64(2), int64(4), object(6)\n",
      "memory usage: 937.6+ KB\n"
     ]
    }
   ],
   "source": [
    "# Column dtypes, non-null counts and memory usage of the combined frame.\n",
    "df2.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a94b44e7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "57550d86",
   "metadata": {},
   "source": [
    "### 数据分析"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b371ad61",
   "metadata": {},
   "source": [
    "#### 总体评论倾向"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "367279aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    60.15%\n",
       "0    39.85%\n",
       "Name: new_sentiment_label, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of each thresholded label, formatted as a percentage with two decimals.\n",
    "label_share = df2[\"new_sentiment_label\"].value_counts(normalize=True)\n",
    "label_share.map(\"{:.2%}\".format)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95cd8a8d",
   "metadata": {},
   "source": [
    "#### 评论分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0081308c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "其他    73.53%\n",
       "配置    13.84%\n",
       "音质     5.97%\n",
       "价格     4.27%\n",
       "功能     1.02%\n",
       "外形     0.95%\n",
       "舒适     0.42%\n",
       "Name: subject, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of comments per subject, formatted as a percentage with two decimals.\n",
    "subject_share = df2[\"subject\"].value_counts(normalize=True)\n",
    "subject_share.map(\"{:.2%}\".format)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d014f1c",
   "metadata": {},
   "source": [
    "#### 各分布的情感倾向"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6a354977",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>subject</th>\n",
       "      <th>sentiment_key</th>\n",
       "      <th>new_sentiment_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>价格</td>\n",
       "      <td>negative</td>\n",
       "      <td>222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>价格</td>\n",
       "      <td>positive</td>\n",
       "      <td>205</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>其他</td>\n",
       "      <td>negative</td>\n",
       "      <td>3787</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>其他</td>\n",
       "      <td>positive</td>\n",
       "      <td>3566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>功能</td>\n",
       "      <td>negative</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>功能</td>\n",
       "      <td>positive</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>外形</td>\n",
       "      <td>negative</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>外形</td>\n",
       "      <td>positive</td>\n",
       "      <td>55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>舒适</td>\n",
       "      <td>negative</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>舒适</td>\n",
       "      <td>positive</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>配置</td>\n",
       "      <td>negative</td>\n",
       "      <td>872</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>配置</td>\n",
       "      <td>positive</td>\n",
       "      <td>512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>音质</td>\n",
       "      <td>negative</td>\n",
       "      <td>282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>音质</td>\n",
       "      <td>positive</td>\n",
       "      <td>315</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   subject sentiment_key  new_sentiment_label\n",
       "0       价格      negative                  222\n",
       "1       价格      positive                  205\n",
       "2       其他      negative                 3787\n",
       "3       其他      positive                 3566\n",
       "4       功能      negative                   57\n",
       "5       功能      positive                   45\n",
       "6       外形      negative                   40\n",
       "7       外形      positive                   55\n",
       "8       舒适      negative                   26\n",
       "9       舒适      positive                   16\n",
       "10      配置      negative                  872\n",
       "11      配置      positive                  512\n",
       "12      音质      negative                  282\n",
       "13      音质      positive                  315"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Comment counts per (subject, model sentiment label). The count gets its own\n",
    "# column name instead of reusing `new_sentiment_label`, which read like a label value.\n",
    "# NOTE(review): grouping is by the model's `sentiment_key`, not by the thresholded\n",
    "# `new_sentiment_label` used for the overall share above - confirm which is intended.\n",
    "(\n",
    "    df2.groupby(by=[\"subject\", \"sentiment_key\"], as_index=False)\n",
    "    .agg(comment_count=(\"new_sentiment_label\", \"count\"))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7d94bfb0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([7353, 1384,  597,  427,  102,   95,   42], dtype=int64)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Comment counts per subject as a NumPy array, largest first.\n",
    "df2[\"subject\"].value_counts().to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d8a8d3ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['其他', '配置', '音质', '价格', '功能', '外形', '舒适'], dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Subject names in the same order as the counts above.\n",
    "df2[\"subject\"].value_counts().index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6a7b332f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<script>\n",
       "    require.config({\n",
       "        paths: {\n",
       "            'echarts':'https://assets.pyecharts.org/assets/echarts.min'\n",
       "        }\n",
       "    });\n",
       "</script>\n",
       "\n",
       "        <div id=\"ba12e90e900a4954b9d86e6145c3ea2f\" style=\"width:900px; height:500px;\"></div>\n",
       "\n",
       "<script>\n",
       "        require(['echarts'], function(echarts) {\n",
       "                var chart_ba12e90e900a4954b9d86e6145c3ea2f = echarts.init(\n",
       "                    document.getElementById('ba12e90e900a4954b9d86e6145c3ea2f'), 'white', {renderer: 'canvas'});\n",
       "                var option_ba12e90e900a4954b9d86e6145c3ea2f = {\n",
       "    \"animation\": true,\n",
       "    \"animationThreshold\": 2000,\n",
       "    \"animationDuration\": 1000,\n",
       "    \"animationEasing\": \"cubicOut\",\n",
       "    \"animationDelay\": 0,\n",
       "    \"animationDurationUpdate\": 300,\n",
       "    \"animationEasingUpdate\": \"cubicOut\",\n",
       "    \"animationDelayUpdate\": 0,\n",
       "    \"color\": [\n",
       "        \"#c23531\",\n",
       "        \"#2f4554\",\n",
       "        \"#61a0a8\",\n",
       "        \"#d48265\",\n",
       "        \"#749f83\",\n",
       "        \"#ca8622\",\n",
       "        \"#bda29a\",\n",
       "        \"#6e7074\",\n",
       "        \"#546570\",\n",
       "        \"#c4ccd3\",\n",
       "        \"#f05b72\",\n",
       "        \"#ef5b9c\",\n",
       "        \"#f47920\",\n",
       "        \"#905a3d\",\n",
       "        \"#fab27b\",\n",
       "        \"#2a5caa\",\n",
       "        \"#444693\",\n",
       "        \"#726930\",\n",
       "        \"#b2d235\",\n",
       "        \"#6d8346\",\n",
       "        \"#ac6767\",\n",
       "        \"#1d953f\",\n",
       "        \"#6950a1\",\n",
       "        \"#918597\"\n",
       "    ],\n",
       "    \"series\": [\n",
       "        {\n",
       "            \"type\": \"bar\",\n",
       "            \"name\": \"\\u8bc4\\u8bba\\u5206\\u5e03\",\n",
       "            \"legendHoverLink\": true,\n",
       "            \"data\": [\n",
       "                1384,\n",
       "                597,\n",
       "                427,\n",
       "                102,\n",
       "                95,\n",
       "                42\n",
       "            ],\n",
       "            \"showBackground\": false,\n",
       "            \"barMinHeight\": 0,\n",
       "            \"barCategoryGap\": \"20%\",\n",
       "            \"barGap\": \"30%\",\n",
       "            \"large\": false,\n",
       "            \"largeThreshold\": 400,\n",
       "            \"seriesLayoutBy\": \"column\",\n",
       "            \"datasetIndex\": 0,\n",
       "            \"clip\": true,\n",
       "            \"zlevel\": 0,\n",
       "            \"z\": 2,\n",
       "            \"label\": {\n",
       "                \"show\": true,\n",
       "                \"position\": \"top\",\n",
       "                \"margin\": 8\n",
       "            }\n",
       "        }\n",
       "    ],\n",
       "    \"legend\": [\n",
       "        {\n",
       "            \"data\": [\n",
       "                \"\\u8bc4\\u8bba\\u5206\\u5e03\"\n",
       "            ],\n",
       "            \"selected\": {\n",
       "                \"\\u8bc4\\u8bba\\u5206\\u5e03\": true\n",
       "            },\n",
       "            \"show\": true,\n",
       "            \"padding\": 5,\n",
       "            \"itemGap\": 10,\n",
       "            \"itemWidth\": 25,\n",
       "            \"itemHeight\": 14\n",
       "        }\n",
       "    ],\n",
       "    \"tooltip\": {\n",
       "        \"show\": true,\n",
       "        \"trigger\": \"item\",\n",
       "        \"triggerOn\": \"mousemove|click\",\n",
       "        \"axisPointer\": {\n",
       "            \"type\": \"line\"\n",
       "        },\n",
       "        \"showContent\": true,\n",
       "        \"alwaysShowContent\": false,\n",
       "        \"showDelay\": 0,\n",
       "        \"hideDelay\": 100,\n",
       "        \"textStyle\": {\n",
       "            \"fontSize\": 14\n",
       "        },\n",
       "        \"borderWidth\": 0,\n",
       "        \"padding\": 5\n",
       "    },\n",
       "    \"xAxis\": [\n",
       "        {\n",
       "            \"show\": true,\n",
       "            \"scale\": false,\n",
       "            \"nameLocation\": \"end\",\n",
       "            \"nameGap\": 15,\n",
       "            \"gridIndex\": 0,\n",
       "            \"axisLabel\": {\n",
       "                \"show\": true,\n",
       "                \"position\": \"top\",\n",
       "                \"rotate\": -15,\n",
       "                \"margin\": 8\n",
       "            },\n",
       "            \"inverse\": false,\n",
       "            \"offset\": 0,\n",
       "            \"splitNumber\": 5,\n",
       "            \"minInterval\": 0,\n",
       "            \"splitLine\": {\n",
       "                \"show\": false,\n",
       "                \"lineStyle\": {\n",
       "                    \"show\": true,\n",
       "                    \"width\": 1,\n",
       "                    \"opacity\": 1,\n",
       "                    \"curveness\": 0,\n",
       "                    \"type\": \"solid\"\n",
       "                }\n",
       "            },\n",
       "            \"data\": [\n",
       "                \"\\u914d\\u7f6e\",\n",
       "                \"\\u97f3\\u8d28\",\n",
       "                \"\\u4ef7\\u683c\",\n",
       "                \"\\u529f\\u80fd\",\n",
       "                \"\\u5916\\u5f62\",\n",
       "                \"\\u8212\\u9002\"\n",
       "            ]\n",
       "        }\n",
       "    ],\n",
       "    \"yAxis\": [\n",
       "        {\n",
       "            \"show\": true,\n",
       "            \"scale\": false,\n",
       "            \"nameLocation\": \"end\",\n",
       "            \"nameGap\": 15,\n",
       "            \"gridIndex\": 0,\n",
       "            \"inverse\": false,\n",
       "            \"offset\": 0,\n",
       "            \"splitNumber\": 5,\n",
       "            \"minInterval\": 0,\n",
       "            \"splitLine\": {\n",
       "                \"show\": false,\n",
       "                \"lineStyle\": {\n",
       "                    \"show\": true,\n",
       "                    \"width\": 1,\n",
       "                    \"opacity\": 1,\n",
       "                    \"curveness\": 0,\n",
       "                    \"type\": \"solid\"\n",
       "                }\n",
       "            }\n",
       "        }\n",
       "    ],\n",
       "    \"title\": [\n",
       "        {\n",
       "            \"text\": \"\\u8bc4\\u8bba\\u5206\\u5e03\",\n",
       "            \"padding\": 5,\n",
       "            \"itemGap\": 10\n",
       "        }\n",
       "    ]\n",
       "};\n",
       "                chart_ba12e90e900a4954b9d86e6145c3ea2f.setOption(option_ba12e90e900a4954b9d86e6145c3ea2f);\n",
       "        });\n",
       "    </script>\n"
      ],
      "text/plain": [
       "<pyecharts.render.display.HTML at 0x1e1f7318820>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyecharts import options as opts\n",
    "from pyecharts.charts import Bar\n",
    "\n",
    "# Derive the bars from df2 rather than hand-copied numbers, so the chart follows\n",
    "# the data. The previous hard-coded lists left out \"其他\", so it is dropped here too.\n",
    "subject_counts = df2[\"subject\"].value_counts().drop(\"其他\", errors=\"ignore\")\n",
    "# tolist() yields plain Python values for the chart options.\n",
    "x_name = subject_counts.index.tolist()\n",
    "y_value = subject_counts.tolist()\n",
    "c = (\n",
    "    Bar()\n",
    "    .add_xaxis(x_name)\n",
    "    .add_yaxis(\"评论分布\", y_value)\n",
    "    .set_global_opts(\n",
    "        # Rotate the category labels by -15 degrees.\n",
    "        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),\n",
    "        title_opts=opts.TitleOpts(title=\"评论分布\"),\n",
    "    )\n",
    ")\n",
    "c.render_notebook()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7690b5f5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "a497e3d7",
   "metadata": {},
   "source": [
    "### 评论分词"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c397dd59",
   "metadata": {},
   "source": [
    "这里使用百度飞桨的LAC分词模型"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d4609a5",
   "metadata": {},
   "source": [
    ">\n",
    "> 飞桨LAC分词模型：[官网地址](https://www.paddlepaddle.org.cn/hubdetail?name=lac&en_category=LexicalAnalysis)\n",
    ">\n",
    ">\n",
    ">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "033313e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[33m[2022-01-19 15:40:18,649] [ WARNING]\u001b[0m - The _initialize method in HubModule will soon be deprecated, you can use the __init__() to handle the initialization of the object\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import paddlehub as hub\n",
    "\n",
    "# Load the LAC word-segmentation module through PaddleHub.\n",
    "lac = hub.Module(name=\"lac\")\n",
    "test_text = df[\"content\"].tolist()\n",
    "# Segment every comment. The output is kept in `lac_results` so the sentiment\n",
    "# `results` that `results_df` was built from are not overwritten; re-running the\n",
    "# concat cell after this one would otherwise pick up the segmentation output.\n",
    "lac_results = lac.cut(text=test_text, use_gpu=False, batch_size=1, return_tag=True)\n",
    "# Flatten the per-comment word lists into one list of tokens.\n",
    "result_word_list = [word for item in lac_results for word in item[\"word\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebab1fa5",
   "metadata": {},
   "source": [
    "#### 去除停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ae160bc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 停用词数据\n",
    "with open(\"./stop_words.txt\",\"r\",encoding=\"utf-8\") as f:\n",
    "    # 用 strip 删除换行符 /n\n",
    "    stop_word_list = [s.strip() for s in f.readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "bedcfb73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 统计每个词出现的次数\n",
    "word_cloud_dict = {}\n",
    "for w in result_word_list:\n",
    "    # 如果在停用词中就不统计\n",
    "    if w in stop_word_list:\n",
    "        continue\n",
    "    if w in word_cloud_dict.keys():\n",
    "        word_cloud_dict[w] = word_cloud_dict[w]+1\n",
    "    else:\n",
    "        word_cloud_dict[w] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "37513953",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 制作词云图的数据\n",
    "word_cloud_data = sorted(word_cloud_dict.items(),key=lambda x:x[1],reverse=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82d6d0bd",
   "metadata": {},
   "source": [
    "#### 绘制词云图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "93a6e20a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyecharts.options as opts\n",
    "from pyecharts.charts import WordCloud\n",
    "\n",
    "\n",
    "# Chart title, rendered at font size 23.\n",
    "cloud_title_opts = opts.TitleOpts(\n",
    "    title=\"评论热词\",\n",
    "    title_textstyle_opts=opts.TextStyleOpts(font_size=23),\n",
    ")\n",
    "\n",
    "# Build the chart step by step instead of as one chained expression.\n",
    "word_cloud = WordCloud()\n",
    "word_cloud.add(series_name=\"评论热词\", data_pair=word_cloud_data, word_size_range=[6, 66])\n",
    "word_cloud.set_global_opts(\n",
    "    title_opts=cloud_title_opts,\n",
    "    tooltip_opts=opts.TooltipOpts(is_show=True),\n",
    ")\n",
    "word_cloud.render_notebook()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77f901f1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
